Data import

# International sunspot number from SIDC, Brussels (1818 - 2024)
SIDC_SILSO <- read_delim("data/raw/SIDC_SILSO.csv", delim = ";", col_names = FALSE)  %>% 
  transmute(Date = as.Date(paste0(X1, "-", X2, "-", X3)), 
            Sunspots = as.numeric(X5))

## Ambulance Data related to acute cerebrovascular accidents
brain_data <- read_csv("data/raw/brain_data.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data related to acute cerebrovascular accidents
brain_data_long <- read_csv("data/raw/brain_data_long.csv") %>%
    mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))

Correlation of quantitative variables

Our dataset contains a large number of quantitative variables:

# Data preprocessing
brain_data_clear <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "var", "K_Sum")) ) %>%
  filter(complete.cases(.))

# Function for displaying correlations with a neutral color for values close to 0
color_correlation_fixed <- function(data, mapping, ...) {
  
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  cor_value <- cor(x, y, use = "complete.obs")
  
  # Defining color
  color <- if (abs(cor_value) < 0.1) {
    scales::alpha("gray50", 0.5)           ## Gray for weak correlations
  } else if (cor_value > 0) {
    scales::alpha("blue", abs(cor_value))  ## Blue for positive correlations
  } else {
    scales::alpha("red", abs(cor_value))   ## Red for negative correlations
  }
  
  # Filter for small correlation values
  cor_label <- ifelse(abs(cor_value) < 0.01, "0", sprintf("%.2f", cor_value))
  
  ggplot(data.frame()) + 
    annotate("text", x = 0.5, y = 0.5, label = cor_label, size = 10, color = color ) +
    theme_void() 
}

# Function for trend line with a golden color
golden_smooth <- function(data, mapping, ...) {
  ggplot(data, mapping) +
    geom_point(alpha = 0.4, size = 0.5, color = "black") + 
    geom_smooth(color = "gold", ...) + 
    theme_custom
}

# Plot construction
ggpairs(
  brain_data_clear, 
  progress = FALSE,
  upper = list(continuous = color_correlation_fixed), ## Highlighting correlations
  lower = list(continuous = golden_smooth) )+         ## Golden trend line
theme_custom +
  theme(
    axis.text.x = element_text(size = 10),  
    axis.text.y = element_text(size = 10) )   

When analyzing the network graph, three groups of features can be identified:

cor_data <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "Sum", "var")) ) %>%
  rename(Temperature = Temp_mean, Precipitation = H2O, 
         `F10.7 index` = F10.7, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>% 
  cor(use = "pairwise.complete.obs") 
  
corrplot(cor_data, method = 'number', type = 'lower', diag = FALSE)

network_plot(cor_data, min_cor = .1)

Some features within the groups are highly correlated!

brain_data %>% 
  drop_na() %>% 
  ggplot(aes(Temp_mean, Pressure))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Temperature")+
  theme_custom

brain_data %>% 
  ggplot(aes(Sunspots, F10.7))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Sunspot Number", y = "F10.7 index")+
  theme_custom

brain_data %>% 
  ggplot(aes(Dst_mean, Ap_mean))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Dst index", y = "Daily Average Ap index")+
  theme_custom

Based on further analysis, the following variables will be excluded:

However, all the variables considered do not have a noticeable relationship with the number of ambulance calls:

  brain_data %>% 
  rename(Temperature = Temp_mean, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>% 
  pivot_longer(cols = c(Temperature, Sunspots, `Ap index`, `Dst index`)) %>% 
  ggplot(aes(x = value, y = EMS))+
  geom_point()+
  geom_smooth(se = FALSE, method = "gam", colour = "goldenrod2", linewidth = 2)+
  theme_custom +
  labs(x = "", y = "Daily Emergency Calls") +  
  facet_wrap(~name, scales = "free")

Data Decomposition

The number of sunspots is directly linked to the solar activity cycles, as confirmed by the graph:

brain_data %>% 
  select(!c(Pressure, F10.7, Ap_mean)) %>% 
  ggplot(aes(Date, Sunspots))+
  geom_line(alpha = 0.3, colour = "royalblue2")+
  geom_smooth(method = "gam", formula = y ~ s(x, k = 10),  se = FALSE,
              colour = "orangered4", linewidth = 2) +
  geom_vline(xintercept = as.Date("2009-01-01"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  geom_vline(xintercept = as.Date("2019-12-31"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  scale_x_date(date_breaks = "2 year", date_labels = "%Y")+
  labs(x = "", y = "Sunspot number")+
  theme_custom

Our data covers the complete 24th solar cycle, spanning the period from December 2008 to December 2019.

Working with time series allows us to decompose the data to identify trends and cyclic patterns:

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(EMS) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("EMS"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Temp_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Temperature"), 

  nrow = 1
)

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Ap_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Ap Index"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Dst_var) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Dst amplitude"), 
  
  nrow = 1
)

Visualizing Association

Solar Cycles

When examining solar cycles over the past 200 years, sunspots can be represented as a factor variable - Sunspots level:

Sunspots_quantile <- SIDC_SILSO %>% 
             filter(Sunspots != -1, between(Date, as.Date("1823-01-01"), as.Date("2020-01-01"))) %>% 
             pull(Sunspots) %>% quantile()
  
SIDC_SILSO %>% 
  filter(Sunspots != -1, 
         between(Date, as.Date("1823-01-01"), as.Date("2020-01-01"))) %>% 
  mutate(Sunspots_level = cut(Sunspots, 
                              breaks = c(-Inf, quantile(Sunspots)[2], quantile(Sunspots)[3], 
                                         quantile(Sunspots)[4], Inf), labels  = 
                                c("Low", "Medium", "High", "Extreme"))) %>% 
  ggplot(aes(Date, Sunspots, group = 1))+
  geom_line(aes(color = Sunspots_level), linewidth = 2)+
  geom_hline(yintercept = Sunspots_quantile[2], colour = "black", linewidth = 1, linetype = "dashed")+
  geom_hline(yintercept = Sunspots_quantile[3], colour = "black", linewidth = 1, linetype = "dashed")+
  geom_hline(yintercept = Sunspots_quantile[4], colour = "black", linewidth = 1, linetype = "dashed")+
  scale_colour_manual(values =  c("Low" = "darkorange", "Medium" = "darkorange2",  
                                  "High" = "darkorange3", "Extreme" = "darkorange4"), 
                       labels = c("Low" = "Low (0-22)", "Medium" = "Medium (23-65)", 
                                  "High" = "High (66-129)","Extreme" = "Extreme (>129)")) +
  scale_x_date( date_breaks = "20 year", date_labels = "%Y")+
  scale_y_continuous(n.breaks = 10)+
  labs(x = "", y = "Sunspot number", color = "Sunspots level", title = "Solar Cycles (1823 - 2023)")+
  theme_custom+
  annotate("text", x = as.Date("1815-01-01"), 
           y = c( (Sunspots_quantile[1] + Sunspots_quantile[2])/2, (Sunspots_quantile[2] + Sunspots_quantile[3])/2, 
                  (Sunspots_quantile[3] + Sunspots_quantile[4])/2, (Sunspots_quantile[4] + Sunspots_quantile[5])/3), 
           label = c("Q1", "Q2", "Q3", "Q4"), size = 10)

EMS ~ Sunspots level

No difference in the number of emergency calls on days with different Sunspots_level is observed:

brain_data %>% 
  mutate(Sunspots_level = case_when(Sunspots > Sunspots_quantile[4] ~ "Extreme", 
                                    Sunspots > Sunspots_quantile[3] ~ "High", 
                                    Sunspots > Sunspots_quantile[2] ~ "Medium", 
                                    TRUE ~ "Low") %>% factor(fct_recode(c("Low", "Medium", "High", "Extreme")))) %>% 
 ggplot(aes(x = EMS,  y = Sunspots_level, fill = Sunspots_level )) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(
    values = c("Low" = "darkorange", "Medium" = "darkorange2", "High" = "darkorange3","Extreme" = "darkorange4"),
    labels = c("Low" = "Low (0-22)", "Medium" = "Medium (23-65)",
               "High" = "High (66-129)","Extreme" = "Extreme (>129)")) +
  labs(x = "Daily Ambulance Calls", y = "",  fill = "Sunspots level",
       title = "Relationship Between Sunspot Levels and Ambulance Calls")+
  theme_custom 

brain_data %>% 
  mutate(Sunspots_level = case_when(Sunspots > Sunspots_quantile[4] ~ "Extreme", 
                                    Sunspots > Sunspots_quantile[3] ~ "High", 
                                    Sunspots > Sunspots_quantile[2] ~ "Medium", 
                                    TRUE ~ "Low") %>% factor(fct_recode(c("Low", "Medium", "High", "Extreme")))) %>%                             
  pairwise_t_test(EMS ~ Sunspots_level, p.adjust.method = "holm") %>% 
  transmute(`Group 1` = group1, `Group 2` = group2, 
            `p-value` = p, `p.adjusted` = p.adj) %>%
  flextable() 

Group 1

Group 2

p-value

p.adjusted

Low

Medium

0.0549

0.330

Low

High

0.0675

0.338

Medium

High

0.9470

1.000

Low

Extreme

0.1730

0.691

Medium

Extreme

0.7710

1.000

High

Extreme

0.8090

1.000

Dst ~ Sunspots

At the same time, a noticeable difference in the Dst index is observed as the sunspot number changes:

brain_data %>%
  filter(Dst_level != "Severe") %>%
  mutate(Dst_level = fct_relevel(Dst_level, c("Quiet", "Weak", "Moderate", "Major", "Severe"))) %>% 
  ggplot(aes(Dst_level, Sunspots))+
  geom_jitter(alpha = 0.5, size = 1, width = 0.2) +  
  geom_boxplot(alpha = 0.5, outliers = FALSE, fill = "goldenrod2", linewidth = 1)+
  labs( x = "Geomagnetic activity level",  y = "Sunspot Number" ) +
  theme_custom

EMS ~ DayOff

  • The difference between the average EMS values for weekdays and weekends is statistically significant.
  • The average EMS value is higher on weekdays (13.33) compared to weekends (11.02).
mean_values <- brain_data %>%
   mutate(DayOff = fct_recode(DayOff, "Working days" = "No", "Non-working days" = "Yes")) %>% 
  group_by(DayOff) %>%
  summarise(mean_EMS = round(mean(EMS, na.rm = TRUE), 2), .groups = "drop")

brain_data %>% 
  mutate(DayOff = fct_recode(DayOff, "Working days" = "No", "Non-working days" = "Yes")) %>% 
ggplot(aes(x = DayOff, y = EMS)) +
  geom_boxplot(aes(fill = DayOff)) +
  geom_text(data = mean_values, 
            aes(x = DayOff, y = mean_EMS, label = mean_EMS), 
            color = "black", size = 8, fontface = "bold", vjust = -1) +
  scale_fill_brewer(palette = "Dark2", direction = -1)+
  scale_y_continuous(limits = c(NA, max(brain_data$EMS) + 3))+
  labs(y = "Daily Emergency Calls", x = "")+
  theme_custom +
  theme(legend.position = "none")+
  stat_pvalue_manual(t_test(data = brain_data %>%  
                              mutate(DayOff = fct_recode(DayOff, 
                                                         "Working days" = "No", 
                                                         "Non-working days" = "Yes")),  
                            EMS ~ DayOff),
                     label = "T-test, p = {p}", 
                     size = 10, 
                     y.position = max(brain_data$EMS) + 0.1)

EMS ~ Season

The distribution of EMS calls across seasons is not statistically significant (paired t-test with Holm adjustment) p.adjusted > 0.05).

brain_data %>% 
mutate(Season = fct_relevel(Season, c("Winter", "Spring", "Summer", "Autumn"))) %>% 
ggplot(aes(x = Season, y = EMS, fill = Season)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE) +
  scale_fill_manual(
    values = c("Winter" = "dodgerblue2", "Spring" = "palegreen2", "Summer" = "tan2","Autumn" = "tomato2")) +
  labs(y = "Daily Emergency Calls", x = "")+
  theme_custom 

brain_data %>%
  pairwise_t_test(EMS ~ Season, p.adjust.method = "holm") %>% 
  transmute(`Group 1` = group1, `Group 2` = group2, 
            `p-value` = p, `p.adjusted` = p.adj) %>%
  flextable() 

Group 1

Group 2

p-value

p.adjusted

Autumn

Spring

0.0162

0.0969

Autumn

Summer

0.1300

0.5200

Spring

Summer

0.3670

1.0000

Autumn

Winter

0.0259

0.1290

Spring

Winter

0.8650

1.0000

Summer

Winter

0.4670

1.0000

EMS ~ Age & Sex

  • All age groups show an increase in calls over time, peaking around 2020.
  • Age 25-44: Fewer calls, with a balanced male-female distribution.
  • Age 45-54: More calls than 25-44, but fewer than 55+; balanced gender distribution.
  • Age 55+: Highest number of calls, with females surpassing males, especially in recent years.
ggarrange(

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age != "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", show.legend = FALSE) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    title = "Ambulance Calls Distribution by Year, Age and Gender",
    x = "", y = "", fill = "Gender") +
  theme_custom + 
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("25-44" = "Age: 25-44 years", 
                                          "45-54" = "Age: 45-54 years"))), 

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age == "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", ) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    x = "", y = "Total EMS", fill = "Gender") +
  theme_custom + theme(legend.position = "inside", legend.justification = c(0.9, 0.2))+
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("55+" = "Age: 55+ years"))), 

nrow = 2

)

Local K index

Daily Average Kp index and Daily Average K index (IRT) show a strong correlation (r = 0.91). However, K index (IRT) has a smaller range and a more uniform distribution compared to Kp index.

brain_data %>% 
  drop_na(Kp_Sum, K_Sum) %>% 
  ggplot(aes(Kp_Sum, K_Sum))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Kp index", y = "Daily Average K index (IRT)")+
  theme_custom

brain_data %>%
  drop_na(Kp_Sum, K_Sum) %>% 
  pivot_longer(cols = c(Kp_Sum, K_Sum), names_to = "index") %>% 
  ggplot() +
  geom_histogram(aes(x = value, fill = index), 
                 binwidth = 5, colour = "black", alpha = 0.5, position = "identity", show.legend = FALSE) +
  scale_fill_manual(
    values = c("K_Sum" = "darkseagreen2", "Kp_Sum" = "steelblue2") )+
  scale_x_continuous(breaks = seq(0, 50, 10))+
  labs( x = "Index Values", y = "Observation Count", fill = "Index Type") +
  theme_custom +
  facet_wrap(~index, labeller = labeller(index = c("K_Sum" = "Daily Average K index (IRT)",
                                                   "Kp_Sum" = "Daily Average Kp index")))